Importing necessary Libraries
In [56]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
#%pip install seaborn
import matplotlib.pyplot as plt
import seaborn as sns
# to split the data into train and test
from sklearn.model_selection import train_test_split
# to build linear regression_model
from sklearn.linear_model import LinearRegression
# to check model performance
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
# to build linear regression_model using statsmodels
import statsmodels.api as sm
Loading Data set
In [57]:
df=pd.read_csv('used_device_data.csv')
Data Overview
In [58]:
df.head()
Out[58]:
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | normalized_used_price | normalized_new_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 14.50 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 4.307572 | 4.715100 |
| 1 | Honor | Android | 17.30 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 5.162097 | 5.519018 |
| 2 | Honor | Android | 16.69 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 5.111084 | 5.884631 |
| 3 | Honor | Android | 25.50 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 5.135387 | 5.630961 |
| 4 | Honor | Android | 15.32 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 4.389995 | 4.947837 |
In [59]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3454 entries, 0 to 3453 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand_name 3454 non-null object 1 os 3454 non-null object 2 screen_size 3454 non-null float64 3 4g 3454 non-null object 4 5g 3454 non-null object 5 main_camera_mp 3275 non-null float64 6 selfie_camera_mp 3452 non-null float64 7 int_memory 3450 non-null float64 8 ram 3450 non-null float64 9 battery 3448 non-null float64 10 weight 3447 non-null float64 11 release_year 3454 non-null int64 12 days_used 3454 non-null int64 13 normalized_used_price 3454 non-null float64 14 normalized_new_price 3454 non-null float64 dtypes: float64(9), int64(2), object(4) memory usage: 404.9+ KB
In [60]:
df.describe()
Out[60]:
| screen_size | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | normalized_used_price | normalized_new_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3454.000000 | 3275.000000 | 3452.000000 | 3450.000000 | 3450.000000 | 3448.000000 | 3447.000000 | 3454.000000 | 3454.000000 | 3454.000000 | 3454.000000 |
| mean | 13.713115 | 9.460208 | 6.554229 | 54.573099 | 4.036122 | 3133.402697 | 182.751871 | 2015.965258 | 674.869716 | 4.364712 | 5.233107 |
| std | 3.805280 | 4.815461 | 6.970372 | 84.972371 | 1.365105 | 1299.682844 | 88.413228 | 2.298455 | 248.580166 | 0.588914 | 0.683637 |
| min | 5.080000 | 0.080000 | 0.000000 | 0.010000 | 0.020000 | 500.000000 | 69.000000 | 2013.000000 | 91.000000 | 1.536867 | 2.901422 |
| 25% | 12.700000 | 5.000000 | 2.000000 | 16.000000 | 4.000000 | 2100.000000 | 142.000000 | 2014.000000 | 533.500000 | 4.033931 | 4.790342 |
| 50% | 12.830000 | 8.000000 | 5.000000 | 32.000000 | 4.000000 | 3000.000000 | 160.000000 | 2015.500000 | 690.500000 | 4.405133 | 5.245892 |
| 75% | 15.340000 | 13.000000 | 8.000000 | 64.000000 | 4.000000 | 4000.000000 | 185.000000 | 2018.000000 | 868.750000 | 4.755700 | 5.673718 |
| max | 30.710000 | 48.000000 | 32.000000 | 1024.000000 | 12.000000 | 9720.000000 | 855.000000 | 2020.000000 | 1094.000000 | 6.619433 | 7.847841 |
Exploratory Data Analysis (EDA)
In [61]:
sns.histplot(data=df,x='normalized_used_price',kde=True)
plt.show()
sns.histplot(data=df,x='normalized_new_price',kde=True)
plt.show()
In [62]:
sns.boxplot(data=df,x='normalized_used_price')
plt.show()
sns.boxplot(data=df,x='normalized_new_price')
plt.show()
In [63]:
df['os'].value_counts(normalize=True)*100
Out[63]:
os Android 93.051534 Others 3.966416 Windows 1.939780 iOS 1.042270 Name: proportion, dtype: float64
In [64]:
sns.countplot(data=df,x='os')
plt.show()
In [65]:
#The amount of RAM is important for the smooth functioning of a device. How does the amount of RAM vary with the brand?
sns.boxplot(data=df,x='brand_name',y='ram')
plt.xticks(rotation=90)
plt.show()
In [66]:
# How does the weight vary for phones and tablets offering large batteries(more than 4500 mAh)?
w=df[df['battery']>4500]
sns.barplot(data=w,x='os',y='weight')
plt.xticks(rotation=90)
plt.show()
sns.barplot(data=w,x='brand_name',y='weight')
plt.xticks(rotation=90)
plt.show()
sns.boxplot(data=w,x='weight')
plt.show()
In [67]:
#How many phones and tablets are available across different brands with a screen size larger than 6 inches(15.24 cm)?
w=df[df['screen_size']>15.24]
sns.countplot(data=w,x='brand_name')
plt.xticks(rotation=90)
plt.show()
sns.boxplot(data=w,x='screen_size')
plt.show()
In [68]:
#What is the distribution of devices offering greater than 8MP selfie cameras across brands?
w=df[df['selfie_camera_mp']>8]
sns.countplot(data=w,x='brand_name')
plt.xticks(rotation=90)
plt.show()
Univariate Analysis
In [69]:
plt.figure(figsize=(20,30))
cols=df.select_dtypes(include=np.number).columns.tolist()
for i, var in enumerate(cols):
plt.subplot(4,4,i+1)
sns.boxplot(data=df,x=var, showmeans=True,color='pink')
plt.tight_layout(pad=2)
plt.show()
In [70]:
plt.figure(figsize=(20,30))
cols=df.select_dtypes(include=np.number).columns.tolist()
for i, var in enumerate(cols):
mean_val = np.mean(df[var])
median_val = np.median(df[var])
plt.subplot(4,4,i+1)
sns.histplot(data=df,x=var,kde=True)
plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=2)
plt.axvline(median_val, color='green', linestyle='dashed', linewidth=2)
plt.tight_layout(pad=2)
plt.show()
Bivariate Analysis
In [72]:
sns.pairplot(df,kind='scatter',plot_kws={'alpha':0.5})
plt.show()
In [73]:
sns.lmplot(x='normalized_new_price',y='normalized_used_price',data=df,scatter_kws={'alpha':0.2})
plt.show()
In [71]:
#Which attributes are highly correlated with the normalized price of a used device?
n=df.drop(['brand_name','os','4g','5g'],axis=1)
sns.heatmap(n.corr(),annot=True,cmap='viridis')
plt.show()
Data Processing
In [74]:
#checking for duplicate values
df.isnull().sum()
Out[74]:
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 179 selfie_camera_mp 2 int_memory 4 ram 4 battery 6 weight 7 release_year 0 days_used 0 normalized_used_price 0 normalized_new_price 0 dtype: int64
In [75]:
cols=['main_camera_mp','selfie_camera_mp','int_memory','ram','battery','weight']
for col in cols:
df[col]=df[col].fillna(df.groupby(['brand_name','release_year'])[col].transform('mean'))
df.isnull().sum()
Out[75]:
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 179 selfie_camera_mp 2 int_memory 0 ram 0 battery 6 weight 7 release_year 0 days_used 0 normalized_used_price 0 normalized_new_price 0 dtype: int64
In [76]:
for col in cols:
df[col]=df[col].fillna(df.groupby('brand_name')[col].transform('mean'))
df.isnull().sum()
Out[76]:
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 10 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 normalized_used_price 0 normalized_new_price 0 dtype: int64
In [77]:
for col in cols:
df[col]=df[col].fillna(df[col].mean())
df.isnull().sum()
Out[77]:
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 0 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 normalized_used_price 0 normalized_new_price 0 dtype: int64
In [78]:
#checking if dataset contains any duplicate values
df.duplicated().sum()
Out[78]:
np.int64(0)
Feature Engineering
In [79]:
#changing the release_year to years_since_released
df['years_since_released']=2025-df['release_year']
#df['years_since_released']=df['release_year'].apply(lambda x:2025-x)
df.head()
Out[79]:
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | normalized_used_price | normalized_new_price | years_since_released | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 14.50 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 4.307572 | 4.715100 | 5 |
| 1 | Honor | Android | 17.30 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 5.162097 | 5.519018 | 5 |
| 2 | Honor | Android | 16.69 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 5.111084 | 5.884631 | 5 |
| 3 | Honor | Android | 25.50 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 5.135387 | 5.630961 | 5 |
| 4 | Honor | Android | 15.32 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 4.389995 | 4.947837 | 5 |
In [80]:
df.drop('release_year',axis=1,inplace=True)
df.head()
Out[80]:
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | days_used | normalized_used_price | normalized_new_price | years_since_released | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 14.50 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 127 | 4.307572 | 4.715100 | 5 |
| 1 | Honor | Android | 17.30 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 325 | 5.162097 | 5.519018 | 5 |
| 2 | Honor | Android | 16.69 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 162 | 5.111084 | 5.884631 | 5 |
| 3 | Honor | Android | 25.50 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 345 | 5.135387 | 5.630961 | 5 |
| 4 | Honor | Android | 15.32 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 293 | 4.389995 | 4.947837 | 5 |
In [81]:
#checking for outliers
plt.figure(figsize=(20,30))
cols=df.select_dtypes(include=np.number).columns.tolist()
for i, var in enumerate(cols):
plt.subplot(4,4,i+1)
sns.boxplot(data=df,x=var)
plt.tight_layout(pad=2)
plt.show()
In [82]:
#converting categorical data to numerical data
df=pd.get_dummies(df,columns=df.select_dtypes(include=['object','category']).columns.tolist(),drop_first=True)
df.head()
Out[82]:
| screen_size | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | days_used | normalized_used_price | normalized_new_price | ... | brand_name_Spice | brand_name_Vivo | brand_name_XOLO | brand_name_Xiaomi | brand_name_ZTE | os_Others | os_Windows | os_iOS | 4g_yes | 5g_yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.50 | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 127 | 4.307572 | 4.715100 | ... | False | False | False | False | False | False | False | False | True | False |
| 1 | 17.30 | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 325 | 5.162097 | 5.519018 | ... | False | False | False | False | False | False | False | False | True | True |
| 2 | 16.69 | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 162 | 5.111084 | 5.884631 | ... | False | False | False | False | False | False | False | False | True | True |
| 3 | 25.50 | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 345 | 5.135387 | 5.630961 | ... | False | False | False | False | False | False | False | False | True | True |
| 4 | 15.32 | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 293 | 4.389995 | 4.947837 | ... | False | False | False | False | False | False | False | False | True | False |
5 rows × 49 columns
In [83]:
#explicitly mapping boolean values to 0 and 1
bool_cols=df.select_dtypes(include='bool').columns.tolist()
for col in bool_cols:
df[col]=df[col].map({True:1,False:0})
df.head()
Out[83]:
| screen_size | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | days_used | normalized_used_price | normalized_new_price | ... | brand_name_Spice | brand_name_Vivo | brand_name_XOLO | brand_name_Xiaomi | brand_name_ZTE | os_Others | os_Windows | os_iOS | 4g_yes | 5g_yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.50 | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 127 | 4.307572 | 4.715100 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 17.30 | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 325 | 5.162097 | 5.519018 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2 | 16.69 | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 162 | 5.111084 | 5.884631 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 3 | 25.50 | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 345 | 5.135387 | 5.630961 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4 | 15.32 | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 293 | 4.389995 | 4.947837 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 49 columns
Building Linear Regression Model
In [84]:
#defining predictors and traget variable
X=df.drop('normalized_used_price',axis=1)
y=df['normalized_used_price']
In [85]:
print(X.shape)
print(y.shape)
(3454, 48) (3454,)
In [86]:
#adding constant manually as statsmodels doesnot add constant on its own
X=sm.add_constant(X)
In [87]:
#spliting the dataset into train and testing set in test size 0.30
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)
In [88]:
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 2417 Number of rows in test data = 1037
In [89]:
#fit/train the data with statsmodels
olsmodel=sm.OLS(y_train,X_train).fit()
print(olsmodel.summary())
OLS Regression Results
=================================================================================
Dep. Variable: normalized_used_price R-squared: 0.849
Model: OLS Adj. R-squared: 0.846
Method: Least Squares F-statistic: 277.4
Date: Mon, 27 Jan 2025 Prob (F-statistic): 0.00
Time: 10:46:47 Log-Likelihood: 126.14
No. Observations: 2417 AIC: -154.3
Df Residuals: 2368 BIC: 129.4
Df Model: 48
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
const 1.5228 0.077 19.812 0.000 1.372 1.674
screen_size 0.0294 0.004 8.327 0.000 0.023 0.036
main_camera_mp 0.0234 0.002 14.933 0.000 0.020 0.026
selfie_camera_mp 0.0117 0.001 10.020 0.000 0.009 0.014
int_memory 0.0002 6.76e-05 2.759 0.006 5.4e-05 0.000
ram 0.0310 0.005 5.889 0.000 0.021 0.041
battery -1.666e-05 7.34e-06 -2.271 0.023 -3.11e-05 -2.27e-06
weight 0.0008 0.000 6.006 0.000 0.001 0.001
days_used 3.324e-05 3.07e-05 1.084 0.278 -2.69e-05 9.34e-05
normalized_new_price 0.4087 0.012 33.259 0.000 0.385 0.433
years_since_released -0.0254 0.005 -5.554 0.000 -0.034 -0.016
brand_name_Alcatel -0.0803 0.050 -1.617 0.106 -0.178 0.017
brand_name_Apple -0.0439 0.148 -0.297 0.766 -0.333 0.246
brand_name_Asus 0.0093 0.049 0.188 0.851 -0.087 0.106
brand_name_BlackBerry 0.0213 0.072 0.297 0.767 -0.120 0.162
brand_name_Celkon -0.2372 0.068 -3.486 0.000 -0.371 -0.104
brand_name_Coolpad -0.0269 0.071 -0.379 0.705 -0.166 0.112
brand_name_Gionee -0.0128 0.059 -0.219 0.827 -0.128 0.102
brand_name_Google -0.1179 0.083 -1.427 0.154 -0.280 0.044
brand_name_HTC -0.0401 0.050 -0.803 0.422 -0.138 0.058
brand_name_Honor -0.0496 0.051 -0.977 0.329 -0.149 0.050
brand_name_Huawei -0.0607 0.046 -1.318 0.188 -0.151 0.030
brand_name_Infinix 0.0958 0.113 0.845 0.398 -0.127 0.318
brand_name_Karbonn -0.0592 0.068 -0.869 0.385 -0.193 0.074
brand_name_LG -0.0608 0.047 -1.300 0.194 -0.152 0.031
brand_name_Lava -0.0218 0.063 -0.347 0.729 -0.145 0.102
brand_name_Lenovo -0.0367 0.047 -0.778 0.437 -0.129 0.056
brand_name_Meizu -0.0912 0.056 -1.623 0.105 -0.201 0.019
brand_name_Micromax -0.0649 0.049 -1.325 0.185 -0.161 0.031
brand_name_Microsoft 0.0737 0.082 0.902 0.367 -0.086 0.234
brand_name_Motorola -0.0691 0.051 -1.359 0.174 -0.169 0.031
brand_name_Nokia 0.0372 0.052 0.709 0.478 -0.066 0.140
brand_name_OnePlus -0.0651 0.073 -0.888 0.375 -0.209 0.079
brand_name_Oppo -0.0222 0.049 -0.453 0.650 -0.118 0.074
brand_name_Others -0.0681 0.044 -1.561 0.119 -0.154 0.017
brand_name_Panasonic -0.0452 0.062 -0.731 0.465 -0.166 0.076
brand_name_Realme -0.0381 0.063 -0.603 0.547 -0.162 0.086
brand_name_Samsung -0.0617 0.045 -1.376 0.169 -0.150 0.026
brand_name_Sony -0.0801 0.053 -1.514 0.130 -0.184 0.024
brand_name_Spice -0.0362 0.068 -0.528 0.597 -0.170 0.098
brand_name_Vivo -0.0653 0.050 -1.296 0.195 -0.164 0.033
brand_name_XOLO -0.0811 0.057 -1.416 0.157 -0.193 0.031
brand_name_Xiaomi 0.0325 0.050 0.655 0.512 -0.065 0.130
brand_name_ZTE -0.0456 0.048 -0.946 0.344 -0.140 0.049
os_Others -0.0597 0.033 -1.835 0.067 -0.123 0.004
os_Windows -0.0368 0.043 -0.848 0.396 -0.122 0.048
os_iOS -0.0125 0.148 -0.085 0.932 -0.302 0.277
4g_yes 0.0405 0.016 2.509 0.012 0.009 0.072
5g_yes -0.0835 0.032 -2.590 0.010 -0.147 -0.020
==============================================================================
Omnibus: 236.387 Durbin-Watson: 1.996
Prob(Omnibus): 0.000 Jarque-Bera (JB): 644.556
Skew: -0.536 Prob(JB): 1.09e-140
Kurtosis: 5.292 Cond. No. 1.85e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.85e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [90]:
# function to compute adjusted R-squared
def adj_r2_score(predictors, targets, predictions):
r2 = r2_score(targets, predictions)
n = predictors.shape[0]
k = predictors.shape[1]
return 1 - ((1 - r2) * (n - 1) / (n - k - 1))
# function to compute MAPE
def mape_score(targets, predictions):
return np.mean(np.abs(targets - predictions) / targets) * 100
# function to compute different metrics to check performance of a regression model
def model_performance_regression(model, predictors, target):
"""
Function to compute different metrics to check regression model performance
model: regressor
predictors: independent variables
target: dependent variable
"""
# predicting using the independent variables
y_pred = model.predict(predictors)
r2 = r2_score(target, y_pred) # to compute R-squared
adjr2 = adj_r2_score(predictors, target, y_pred) # to compute adjusted R-squared
rmse = np.sqrt(mean_squared_error(target, y_pred)) # to compute RMSE
mae = mean_absolute_error(target, y_pred) # to compute MAE
mape = mape_score(target, y_pred) # to compute MAPE
mse=mean_squared_error(target,y_pred)
# creating a dataframe of metrics
df_perf = pd.DataFrame(
{
"RMSE": rmse,
"MAE": mae,
"R-squared": r2,
"Adj. R-squared": adjr2,
"MAPE": mape,
"MSE":mse
},
index=[0],
)
return df_perf
In [91]:
# checking model performance on test set (seen 30% data)
print("Test Performance\n")
olsmodel_test_perf = model_performance_regression(olsmodel, X_test, y_test)
olsmodel_test_perf
Test Performance
Out[91]:
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | MSE | |
|---|---|---|---|---|---|---|
| 0 | 0.238987 | 0.188661 | 0.832281 | 0.823954 | 4.512546 | 0.057115 |
Checking Linear Regression Assumptions
In [92]:
#checking for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def check_vif(predictors):
vif=pd.DataFrame()
vif['feature']=predictors.columns
vif['VIF']=[variance_inflation_factor(predictors.values,i)
for i in range(predictors.shape[1])]
print(vif)
check_vif(X_train)
feature VIF 0 const 265.235338 1 screen_size 8.267208 2 main_camera_mp 2.441760 3 selfie_camera_mp 2.870106 4 int_memory 1.362897 5 ram 2.280539 6 battery 4.058464 7 weight 6.416315 8 days_used 2.580558 9 normalized_new_price 3.240086 10 years_since_released 4.880499 11 brand_name_Alcatel 3.458624 12 brand_name_Apple 11.194254 13 brand_name_Asus 3.651767 14 brand_name_BlackBerry 1.623856 15 brand_name_Celkon 1.873748 16 brand_name_Coolpad 1.575087 17 brand_name_Gionee 2.076651 18 brand_name_Google 1.388122 19 brand_name_HTC 3.460295 20 brand_name_Honor 3.560563 21 brand_name_Huawei 6.396085 22 brand_name_Infinix 1.191416 23 brand_name_Karbonn 1.628728 24 brand_name_LG 5.354505 25 brand_name_Lava 1.825906 26 brand_name_Lenovo 4.705291 27 brand_name_Meizu 2.419509 28 brand_name_Micromax 3.779651 29 brand_name_Microsoft 2.092660 30 brand_name_Motorola 3.487715 31 brand_name_Nokia 3.752838 32 brand_name_OnePlus 1.589083 33 brand_name_Oppo 4.286843 34 brand_name_Others 10.833598 35 brand_name_Panasonic 1.891099 36 brand_name_Realme 1.979198 37 brand_name_Samsung 8.013560 38 brand_name_Sony 2.895622 39 brand_name_Spice 1.638518 40 brand_name_Vivo 3.734876 41 brand_name_XOLO 2.162156 42 brand_name_Xiaomi 4.082231 43 brand_name_ZTE 4.342827 44 os_Others 1.884543 45 os_Windows 1.742215 46 os_iOS 10.036811 47 4g_yes 2.543265 48 5g_yes 1.816511
In [94]:
# Checking the effect of dropping the columns showing high multicollinearity on model performance
def check_multicollinear(predictors,target,high_vif_cols):
adj_r2=[]
rmse=[]
for cols in high_vif_cols:
#x_train = predictors[[col for col in predictors.columns if col not in high_vif_cols]]
x_train = predictors.loc[:, ~predictors.columns.str.startswith(cols)]
olsmodel = sm.OLS(target, x_train).fit()
adj_r2.append(olsmodel.rsquared_adj)
rmse.append(np.sqrt(olsmodel.mse_resid))
err_val = pd.DataFrame(
{
"col": high_vif_cols,
"Adj. R-squared after_dropping col": adj_r2,
"RMSE after dropping col": rmse,
}).sort_values(by="Adj. R-squared after_dropping col", ascending=False)
err_val.reset_index(drop=True, inplace=True)
return err_val
In [95]:
cols=['screen_size','weight', 'brand_name_Others','brand_name_Samsung','os_iOS','brand_name_Huawei','brand_name_LG','brand_name_Apple']
check_multicollinear(X_train,y_train,cols)
Out[95]:
| col | Adj. R-squared after_dropping col | RMSE after dropping col | |
|---|---|---|---|
| 0 | os_iOS | 0.846014 | 0.231982 |
| 1 | brand_name_Apple | 0.846009 | 0.231986 |
| 2 | brand_name_LG | 0.845905 | 0.232064 |
| 3 | brand_name_Huawei | 0.845902 | 0.232067 |
| 4 | brand_name_Samsung | 0.845891 | 0.232074 |
| 5 | brand_name_Others | 0.845856 | 0.232101 |
| 6 | weight | 0.843669 | 0.233742 |
| 7 | screen_size | 0.841506 | 0.235353 |
In [96]:
#dropping os_iOS to remove multicllinearity
cols_to_drop='brand_name_Apple'
X_train=X_train.loc[:,~X_train.columns.str.startswith(cols_to_drop)]
X_test=X_test.loc[:,~X_test.columns.str.startswith(cols_to_drop)]
check_vif(X_train)
feature VIF 0 const 264.362218 1 screen_size 8.211058 2 main_camera_mp 2.441255 3 selfie_camera_mp 2.842828 4 int_memory 1.362704 5 ram 2.269427 6 battery 4.054385 7 weight 6.403930 8 days_used 2.579837 9 normalized_new_price 3.223802 10 years_since_released 4.872671 11 brand_name_Alcatel 3.259928 12 brand_name_Asus 3.425740 13 brand_name_BlackBerry 1.552592 14 brand_name_Celkon 1.823235 15 brand_name_Coolpad 1.534053 16 brand_name_Gionee 1.996963 17 brand_name_Google 1.351230 18 brand_name_HTC 3.275810 19 brand_name_Honor 3.348338 20 brand_name_Huawei 5.927980 21 brand_name_Infinix 1.178253 22 brand_name_Karbonn 1.592841 23 brand_name_LG 4.999425 24 brand_name_Lava 1.776899 25 brand_name_Lenovo 4.410141 26 brand_name_Meizu 2.308117 27 brand_name_Micromax 3.591787 28 brand_name_Microsoft 2.045144 29 brand_name_Motorola 3.296308 30 brand_name_Nokia 3.510873 31 brand_name_OnePlus 1.538895 32 brand_name_Oppo 4.042084 33 brand_name_Others 10.062605 34 brand_name_Panasonic 1.826257 35 brand_name_Realme 1.903549 36 brand_name_Samsung 7.386128 37 brand_name_Sony 2.745310 38 brand_name_Spice 1.606129 39 brand_name_Vivo 3.509079 40 brand_name_XOLO 2.086904 41 brand_name_Xiaomi 3.837503 42 brand_name_ZTE 4.092439 43 os_Others 1.760957 44 os_Windows 1.741139 45 os_iOS 1.795776 46 4g_yes 2.542996 47 5g_yes 1.805394
In [97]:
cols=['screen_size','weight','brand_name_Huawei','brand_name_Others','brand_name_Samsung']
check_multicollinear(X_train,y_train,cols)
Out[97]:
| col | Adj. R-squared after_dropping col | RMSE after dropping col | |
|---|---|---|---|
| 0 | brand_name_Huawei | 0.845966 | 0.232018 |
| 1 | brand_name_Samsung | 0.845956 | 0.232026 |
| 2 | brand_name_Others | 0.845920 | 0.232053 |
| 3 | weight | 0.843714 | 0.233708 |
| 4 | screen_size | 0.841563 | 0.235311 |
In [98]:
cols_to_drop='brand_name_Huawei'
X_train=X_train.loc[:,~X_train.columns.str.startswith(cols_to_drop)]
X_test=X_test.loc[:,~X_test.columns.str.startswith(cols_to_drop)]
check_vif(X_train)
feature VIF 0 const 197.165240 1 screen_size 8.207623 2 main_camera_mp 2.441148 3 selfie_camera_mp 2.837717 4 int_memory 1.360540 5 ram 2.269420 6 battery 4.049528 7 weight 6.399444 8 days_used 2.579651 9 normalized_new_price 3.220844 10 years_since_released 4.872645 11 brand_name_Alcatel 1.403562 12 brand_name_Asus 1.382653 13 brand_name_BlackBerry 1.135060 14 brand_name_Celkon 1.293384 15 brand_name_Coolpad 1.091392 16 brand_name_Gionee 1.174860 17 brand_name_Google 1.069847 18 brand_name_HTC 1.382457 19 brand_name_Honor 1.367194 20 brand_name_Infinix 1.042740 21 brand_name_Karbonn 1.137312 22 brand_name_LG 1.634606 23 brand_name_Lava 1.165788 24 brand_name_Lenovo 1.541417 25 brand_name_Meizu 1.211174 26 brand_name_Micromax 1.514968 27 brand_name_Microsoft 1.648584 28 brand_name_Motorola 1.377890 29 brand_name_Nokia 1.737346 30 brand_name_OnePlus 1.111873 31 brand_name_Oppo 1.494075 32 brand_name_Others 2.514153 33 brand_name_Panasonic 1.144595 34 brand_name_Realme 1.181602 35 brand_name_Samsung 1.988369 36 brand_name_Sony 1.328348 37 brand_name_Spice 1.147461 38 brand_name_Vivo 1.392784 39 brand_name_XOLO 1.229257 40 brand_name_Xiaomi 1.446453 41 brand_name_ZTE 1.496585 42 os_Others 1.753870 43 os_Windows 1.740210 44 os_iOS 1.182128 45 4g_yes 2.529307 46 5g_yes 1.805104
In [99]:
cols=['screen_size','weight']
check_multicollinear(X_train,y_train,cols)
Out[99]:
| col | Adj. R-squared after_dropping col | RMSE after dropping col | |
|---|---|---|---|
| 0 | weight | 0.843644 | 0.233760 |
| 1 | screen_size | 0.841492 | 0.235364 |
In [100]:
cols_to_drop='screen_size'
X_train=X_train.loc[:,~X_train.columns.str.startswith(cols_to_drop)]
X_test=X_test.loc[:,~X_test.columns.str.startswith(cols_to_drop)]
check_vif(X_train)
feature VIF 0 const 167.728433 1 main_camera_mp 2.440872 2 selfie_camera_mp 2.832695 3 int_memory 1.354844 4 ram 2.269181 5 battery 3.752106 6 weight 2.847601 7 days_used 2.566639 8 normalized_new_price 3.166189 9 years_since_released 4.727538 10 brand_name_Alcatel 1.401579 11 brand_name_Asus 1.379825 12 brand_name_BlackBerry 1.134364 13 brand_name_Celkon 1.292674 14 brand_name_Coolpad 1.090708 15 brand_name_Gionee 1.166961 16 brand_name_Google 1.067661 17 brand_name_HTC 1.379211 18 brand_name_Honor 1.365423 19 brand_name_Infinix 1.042528 20 brand_name_Karbonn 1.136454 21 brand_name_LG 1.627115 22 brand_name_Lava 1.165777 23 brand_name_Lenovo 1.538901 24 brand_name_Meizu 1.209879 25 brand_name_Micromax 1.514403 26 brand_name_Microsoft 1.647430 27 brand_name_Motorola 1.368862 28 brand_name_Nokia 1.726944 29 brand_name_OnePlus 1.111841 30 brand_name_Oppo 1.491496 31 brand_name_Others 2.491646 32 brand_name_Panasonic 1.144573 33 brand_name_Realme 1.181067 34 brand_name_Samsung 1.979814 35 brand_name_Sony 1.323848 36 brand_name_Spice 1.142207 37 brand_name_Vivo 1.392414 38 brand_name_XOLO 1.229186 39 brand_name_Xiaomi 1.445235 40 brand_name_ZTE 1.492484 41 os_Others 1.491819 42 os_Windows 1.740073 43 os_iOS 1.177008 44 4g_yes 2.528732 45 5g_yes 1.801422
In [101]:
#fitting the data after removing multicollinearity
olsmodel=sm.OLS(y_train,X_train).fit()
print(olsmodel.summary())
OLS Regression Results
=================================================================================
Dep. Variable: normalized_used_price R-squared: 0.844
Model: OLS Adj. R-squared: 0.841
Method: Least Squares F-statistic: 286.0
Date: Mon, 27 Jan 2025 Prob (F-statistic): 0.00
Time: 10:50:36 Log-Likelihood: 90.134
No. Observations: 2417 AIC: -88.27
Df Residuals: 2371 BIC: 178.1
Df Model: 45
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
const 1.6858 0.062 27.190 0.000 1.564 1.807
main_camera_mp 0.0235 0.002 14.800 0.000 0.020 0.027
selfie_camera_mp 0.0121 0.001 10.265 0.000 0.010 0.014
int_memory 0.0001 6.84e-05 2.140 0.032 1.22e-05 0.000
ram 0.0313 0.005 5.882 0.000 0.021 0.042
battery -4.534e-07 7.16e-06 -0.063 0.949 -1.45e-05 1.36e-05
weight 0.0016 9.02e-05 18.161 0.000 0.001 0.002
days_used 1.553e-05 3.1e-05 0.501 0.617 -4.53e-05 7.64e-05
normalized_new_price 0.4223 0.012 34.269 0.000 0.398 0.446
years_since_released -0.0319 0.005 -6.992 0.000 -0.041 -0.023
brand_name_Alcatel -0.0200 0.032 -0.625 0.532 -0.083 0.043
brand_name_Asus 0.0488 0.031 1.590 0.112 -0.011 0.109
brand_name_BlackBerry 0.0603 0.061 0.988 0.323 -0.059 0.180
brand_name_Celkon -0.1984 0.057 -3.462 0.001 -0.311 -0.086
brand_name_Coolpad 0.0125 0.060 0.209 0.834 -0.105 0.130
brand_name_Gionee 0.0078 0.045 0.175 0.861 -0.080 0.095
brand_name_Google -0.0935 0.073 -1.272 0.204 -0.238 0.051
brand_name_HTC -0.0019 0.032 -0.061 0.951 -0.065 0.061
brand_name_Honor 0.0122 0.032 0.383 0.701 -0.050 0.075
brand_name_Infinix 0.1358 0.108 1.263 0.207 -0.075 0.347
brand_name_Karbonn 0.0032 0.058 0.056 0.955 -0.110 0.117
brand_name_LG -0.0242 0.026 -0.924 0.356 -0.075 0.027
brand_name_Lava 0.0268 0.051 0.525 0.600 -0.073 0.127
brand_name_Lenovo 0.0051 0.027 0.185 0.853 -0.049 0.059
brand_name_Meizu -0.0498 0.040 -1.237 0.216 -0.129 0.029
brand_name_Micromax -0.0200 0.031 -0.635 0.526 -0.082 0.042
brand_name_Microsoft 0.1071 0.074 1.456 0.145 -0.037 0.251
brand_name_Motorola -0.0387 0.032 -1.197 0.231 -0.102 0.025
brand_name_Nokia 0.0644 0.036 1.786 0.074 -0.006 0.135
brand_name_OnePlus -0.0096 0.062 -0.154 0.878 -0.132 0.112
brand_name_Oppo 0.0198 0.029 0.675 0.500 -0.038 0.077
brand_name_Others -0.0345 0.021 -1.624 0.105 -0.076 0.007
brand_name_Panasonic 0.0041 0.049 0.084 0.933 -0.091 0.100
brand_name_Realme 0.0060 0.050 0.120 0.904 -0.091 0.103
brand_name_Samsung -0.0229 0.023 -1.013 0.311 -0.067 0.021
brand_name_Sony -0.0464 0.036 -1.278 0.201 -0.118 0.025
brand_name_Spice -0.0191 0.058 -0.330 0.741 -0.133 0.094
brand_name_Vivo -0.0087 0.031 -0.278 0.781 -0.070 0.053
brand_name_XOLO -0.0343 0.044 -0.783 0.434 -0.120 0.052
brand_name_Xiaomi 0.0777 0.030 2.598 0.009 0.019 0.136
brand_name_ZTE -0.0066 0.029 -0.231 0.817 -0.063 0.050
os_Others -0.1610 0.029 -5.484 0.000 -0.219 -0.103
os_Windows -0.0326 0.044 -0.741 0.459 -0.119 0.054
os_iOS -0.0333 0.051 -0.648 0.517 -0.134 0.067
4g_yes 0.0369 0.016 2.261 0.024 0.005 0.069
5g_yes -0.0954 0.033 -2.930 0.003 -0.159 -0.032
==============================================================================
Omnibus: 237.495 Durbin-Watson: 1.996
Prob(Omnibus): 0.000 Jarque-Bera (JB): 609.795
Skew: -0.557 Prob(JB): 3.84e-133
Kurtosis: 5.195 Cond. No. 8.23e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.23e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Dealing with high P values
In [103]:
#removing features with p_value greater than 0.5
cols = X_train.columns.tolist() # Get a list of column names
# Setting the initial max p-value
max_p_value = 1.0
threshold = 0.05 # Define p-value threshold
while len(cols) > 0:
# Define the train set with the current columns
x_train_aux = X_train[cols]
# Adding a constant to the model for the intercept
x_train_aux = sm.add_constant(x_train_aux)
# Fitting the OLS model
model = sm.OLS(y_train, x_train_aux).fit()
# Get the p-values and the maximum p-value
p_values = model.pvalues
max_p_value = max(p_values) # Find the maximum p-value
# Name of the feature with the highest p-value
feature_with_p_max = p_values.idxmax()
if max_p_value > threshold:
# Drop the feature with the highest p-value
cols.remove(feature_with_p_max)
else:
# All p-values are below threshold, stop the loop
break
# Store the selected features
selected_features = cols
print(selected_features)
['const', 'main_camera_mp', 'selfie_camera_mp', 'int_memory', 'ram', 'weight', 'normalized_new_price', 'years_since_released', 'brand_name_Asus', 'brand_name_Celkon', 'brand_name_Nokia', 'brand_name_Xiaomi', 'os_Others', '4g_yes', '5g_yes']
In [104]:
#updating train and test datasets with only selected features
X_train=X_train[selected_features]
X_test=X_test[selected_features]
In [105]:
olsmodel=sm.OLS(y_train,X_train).fit()
print(olsmodel.summary())
OLS Regression Results
=================================================================================
Dep. Variable: normalized_used_price R-squared: 0.843
Model: OLS Adj. R-squared: 0.842
Method: Least Squares F-statistic: 921.8
Date: Mon, 27 Jan 2025 Prob (F-statistic): 0.00
Time: 10:52:02 Log-Likelihood: 79.594
No. Observations: 2417 AIC: -129.2
Df Residuals: 2402 BIC: -42.33
Df Model: 14
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 1.6785 0.053 31.817 0.000 1.575 1.782
main_camera_mp 0.0234 0.001 16.048 0.000 0.021 0.026
selfie_camera_mp 0.0128 0.001 11.337 0.000 0.011 0.015
int_memory 0.0002 6.73e-05 2.307 0.021 2.33e-05 0.000
ram 0.0306 0.005 5.858 0.000 0.020 0.041
weight 0.0016 5.98e-05 27.359 0.000 0.002 0.002
normalized_new_price 0.4185 0.011 37.435 0.000 0.397 0.440
years_since_released -0.0299 0.003 -8.645 0.000 -0.037 -0.023
brand_name_Asus 0.0649 0.026 2.469 0.014 0.013 0.116
brand_name_Celkon -0.1896 0.054 -3.507 0.000 -0.296 -0.084
brand_name_Nokia 0.0702 0.030 2.348 0.019 0.012 0.129
brand_name_Xiaomi 0.0874 0.025 3.440 0.001 0.038 0.137
os_Others -0.1597 0.028 -5.722 0.000 -0.214 -0.105
4g_yes 0.0433 0.015 2.837 0.005 0.013 0.073
5g_yes -0.0893 0.032 -2.798 0.005 -0.152 -0.027
==============================================================================
Omnibus: 242.804 Durbin-Watson: 1.998
Prob(Omnibus): 0.000 Jarque-Bera (JB): 639.688
Skew: -0.561 Prob(JB): 1.24e-139
Kurtosis: 5.257 Cond. No. 2.49e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.49e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [106]:
#testing performance of dataset with new selected features
print("Test Performance\n")
olsmodel_test_perf = model_performance_regression(olsmodel, X_test, y_test)
olsmodel_test_perf
Test Performance
Out[106]:
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | MSE | |
|---|---|---|---|---|---|---|
| 0 | 0.240541 | 0.189391 | 0.830093 | 0.827596 | 4.541333 | 0.05786 |
Test for Linearity
In [107]:
# let us create a dataframe with actual, fitted and residual values
df_pred = pd.DataFrame()
df_pred["Actual Values"] = y_train # actual values
df_pred["Predicted Values"] = olsmodel.fittedvalues # predicted values
df_pred["Residuals"] = olsmodel.resid # residuals
df_pred.head()
Out[107]:
| Actual Values | Predicted Values | Residuals | |
|---|---|---|---|
| 1744 | 4.261975 | 4.331470 | -0.069494 |
| 3141 | 4.175156 | 3.913484 | 0.261672 |
| 1233 | 4.117410 | 4.426274 | -0.308864 |
| 3046 | 3.782597 | 3.877856 | -0.095258 |
| 2649 | 3.981922 | 3.888998 | 0.092924 |
In [108]:
# REsidual plot for the Predicted values vs residuals to check linearity
sns.residplot( data=df_pred, x="Predicted Values", y="Residuals", color="blue", lowess=True)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Predicted vs Residual plot")
plt.show()
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) Cell In[108], line 3 1 # REsidual plot for the Predicted values vs residuals to check linearity ----> 3 sns.residplot( data=df_pred, x="Predicted Values", y="Residuals", color="blue", lowess=True) 4 plt.xlabel("Predicted Values") 5 plt.ylabel("Residuals") File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\seaborn\regression.py:939, in residplot(data, x, y, x_partial, y_partial, lowess, order, robust, dropna, label, color, scatter_kws, line_kws, ax) 937 scatter_kws = {} if scatter_kws is None else scatter_kws.copy() 938 line_kws = {} if line_kws is None else line_kws.copy() --> 939 plotter.plot(ax, scatter_kws, line_kws) 940 return ax File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\seaborn\regression.py:384, in _RegressionPlotter.plot(self, ax, scatter_kws, line_kws) 381 self.scatterplot(ax, scatter_kws) 383 if self.fit_reg: --> 384 self.lineplot(ax, line_kws) 386 # Label the axes 387 if hasattr(self.x, "name"): File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\seaborn\regression.py:429, in _RegressionPlotter.lineplot(self, ax, kws) 427 """Draw the model.""" 428 # Fit the regression model --> 429 grid, yhat, err_bands = self.fit_regression(ax) 430 edges = grid[0], grid[-1] 432 # Get set default aesthetics File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\seaborn\regression.py:198, in _RegressionPlotter.fit_regression(self, ax, x_range, grid) 196 def fit_regression(self, ax=None, x_range=None, grid=None): 197 """Fit the regression model.""" --> 198 self._check_statsmodels() 200 # Create the grid for the regression 201 if grid is None: File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\seaborn\regression.py:194, in _RegressionPlotter._check_statsmodels(self) 192 for option in options: 193 if getattr(self, option) and not _has_statsmodels: --> 194 raise RuntimeError(err.format(option)) RuntimeError: `lowess=True` requires statsmodels, an optional dependency, to be installed.
Test For Normality
In [110]:
#histogram for residuals
sns.histplot(data=df_pred, x="Residuals", kde=True)
plt.title("Normality of residuals")
plt.show()
In [111]:
import scipy.stats as stats
stats.probplot(df_pred['Residuals'],dist='norm',plot=plt )
plt.show()
In [112]:
stats.shapiro(df_pred["Residuals"])
Out[112]:
ShapiroResult(statistic=np.float64(0.9649669821290006), pvalue=np.float64(8.790640188232544e-24))
Test For Homoscedacity
In [114]:
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(df_pred["Residuals"], X_train)
lzip(name, test)
Out[114]:
[('F statistic', np.float64(0.9485456177187723)),
('p-value', np.float64(0.8191750280200132))]
Final Model Summary
In [116]:
#comparing preicted and actual values
pred = olsmodel.predict(X_test)
df_pred_test = pd.DataFrame({"Actual": y_test, "Predicted": pred})
df_pred_test.sample(10, random_state=1)
Out[116]:
| Actual | Predicted | |
|---|---|---|
| 1174 | 4.553772 | 4.502079 |
| 2247 | 4.330996 | 4.055695 |
| 2991 | 4.180828 | 4.101595 |
| 2997 | 4.291007 | 4.265098 |
| 2982 | 4.649761 | 4.545661 |
| 479 | 4.284689 | 4.232865 |
| 17 | 4.777020 | 4.693575 |
| 471 | 4.721886 | 4.496748 |
| 2918 | 4.482550 | 4.475868 |
| 602 | 3.852485 | 4.161186 |
In [118]:
#final Training perfromance
model_performance_regression(olsmodel,X_train,y_train)
Out[118]:
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | MSE | |
|---|---|---|---|---|---|---|
| 0 | 0.234132 | 0.181735 | 0.843082 | 0.842101 | 4.384008 | 0.054818 |
In [119]:
#final testing perfromance
model_performance_regression(olsmodel,X_test,y_test)
Out[119]:
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | MSE | |
|---|---|---|---|---|---|---|
| 0 | 0.240541 | 0.189391 | 0.830093 | 0.827596 | 4.541333 | 0.05786 |
In [120]:
olsmodel=sm.OLS(y_train,X_train).fit()
print(olsmodel.summary())
OLS Regression Results
=================================================================================
Dep. Variable: normalized_used_price R-squared: 0.843
Model: OLS Adj. R-squared: 0.842
Method: Least Squares F-statistic: 921.8
Date: Mon, 27 Jan 2025 Prob (F-statistic): 0.00
Time: 10:57:22 Log-Likelihood: 79.594
No. Observations: 2417 AIC: -129.2
Df Residuals: 2402 BIC: -42.33
Df Model: 14
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 1.6785 0.053 31.817 0.000 1.575 1.782
main_camera_mp 0.0234 0.001 16.048 0.000 0.021 0.026
selfie_camera_mp 0.0128 0.001 11.337 0.000 0.011 0.015
int_memory 0.0002 6.73e-05 2.307 0.021 2.33e-05 0.000
ram 0.0306 0.005 5.858 0.000 0.020 0.041
weight 0.0016 5.98e-05 27.359 0.000 0.002 0.002
normalized_new_price 0.4185 0.011 37.435 0.000 0.397 0.440
years_since_released -0.0299 0.003 -8.645 0.000 -0.037 -0.023
brand_name_Asus 0.0649 0.026 2.469 0.014 0.013 0.116
brand_name_Celkon -0.1896 0.054 -3.507 0.000 -0.296 -0.084
brand_name_Nokia 0.0702 0.030 2.348 0.019 0.012 0.129
brand_name_Xiaomi 0.0874 0.025 3.440 0.001 0.038 0.137
os_Others -0.1597 0.028 -5.722 0.000 -0.214 -0.105
4g_yes 0.0433 0.015 2.837 0.005 0.013 0.073
5g_yes -0.0893 0.032 -2.798 0.005 -0.152 -0.027
==============================================================================
Omnibus: 242.804 Durbin-Watson: 1.998
Prob(Omnibus): 0.000 Jarque-Bera (JB): 639.688
Skew: -0.561 Prob(JB): 1.24e-139
Kurtosis: 5.257 Cond. No. 2.49e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.49e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]: